In [ ]:
# Parameters: which WoS database snapshot (Oracle schema) to query and the
# keyword search term.
database="wos12b"
searchterm="big data" # lowercase! (keywords are lower-cased in SQL before the LIKE match)
In [ ]:
import cx_Oracle #ensure that OS, InstantClient (Basic, ODBC, SDK) and cx_Oracle are all 64 bit. Install with "pip install cx_Oracle". Add link to InstantClient in Path variable!
import pandas as pd
import plotly #cmd: conda install plotly
plotly.__version__
# (*) To communicate with Plotly's server, sign in with credentials file
import plotly.plotly as py
import cufflinks as cf
# (*) Useful Python/Plotly tools
import random
from plotly.grid_objs import Grid, Column
import time
In [ ]:
# Connect to the bibliometric Oracle database (tunnelled to localhost:6025).
# The original code contained literal `<username>, <password>` placeholders,
# which is a syntax error; credentials are now read from environment
# variables so they are never committed with the notebook.
# Set ORACLE_USER and ORACLE_PASSWORD before running this cell.
import os

dsn_tns = cx_Oracle.makedsn('127.0.0.1', '6025', service_name='bibliodb01.fiz.karlsruhe')
# open connection:
db = cx_Oracle.connect(os.environ['ORACLE_USER'], os.environ['ORACLE_PASSWORD'], dsn_tns)
print(db.version)  # smoke test: prints the server version if the connection works
In [ ]:
#%% functions:
def read_query(connection, query):
    """Execute *query* on *connection* and return the result as a DataFrame.

    Column names are taken from the cursor description. The cursor is
    always closed, even when execution raises.
    """
    cur = connection.cursor()
    try:
        cur.execute(query)
        header = [col[0] for col in cur.description]
        return pd.DataFrame(cur.fetchall(), columns=header)
    finally:
        if cur is not None:
            cur.close()
def array_to_string(arr):
    """Format an iterable as a SQL IN-list literal, e.g. [1, 2] -> "('1','2')".

    Values are single-quoted but NOT escaped, so only pass trusted values
    (here: numeric primary keys read from the database itself).
    An empty iterable yields "()".
    """
    # str.join replaces the original quadratic string-concatenation loop.
    return "(" + ",".join("'" + str(value) + "'" for value in arr) + ")"
def get_random_color(pastel_factor=0.5):
    """Return a random RGB triple, blended towards white by *pastel_factor*.

    Each channel is uniform in (0, 1) and mapped to the (pastel, 1] range.
    """
    channels = [random.uniform(0, 1.0) for _ in range(3)]
    return [(c + pastel_factor) / (1.0 + pastel_factor) for c in channels]
def color_distance(c1, c2):
    """Manhattan (L1) distance between two equal-length color tuples."""
    return sum(abs(a - b) for a, b in zip(c1, c2))
def generate_new_color(existing_colors, pastel_factor=0.5):
    """Sample up to 100 random pastel colors and return the one whose minimum
    L1 distance to every color in *existing_colors* is largest.

    If *existing_colors* is empty, the first sample is returned immediately.
    """
    best_color = None
    best_separation = None
    for _ in range(100):
        candidate = get_random_color(pastel_factor=pastel_factor)
        if not existing_colors:
            return candidate
        separation = min(color_distance(candidate, c) for c in existing_colors)
        if not best_separation or separation > best_separation:
            best_separation = separation
            best_color = candidate
    return best_color
def get_all_papers_json(year, class_category):
    """Return the unique papers of one (year, category) cell as JSON records.

    Reads the module-level `dataset_original` and `CONSTGrouping` globals.
    """
    mask = ((dataset_original['PUBYEAR'] == int(year))
            & (dataset_original[CONSTGrouping] == class_category))
    papers = dataset_original[mask].loc[:, ['DOI', 'ARTICLE_TITLE', 'PUBYEAR', 'KEYWORD']]
    return papers.drop_duplicates().to_json(orient='records')
def get_all_papers(year, class_category):
    """Return the unique papers of one (year, category) cell as an HTML table.

    DOIs are rendered as clickable dx.doi.org links. Reads the module-level
    `dataset_original` and `CONSTGrouping` globals.
    """
    yearint = int(year)
    # select the papers of this cell:
    mask = ((dataset_original['PUBYEAR'] == yearint)
            & (dataset_original[CONSTGrouping] == class_category))
    outputdf = dataset_original[mask].loc[:, ['DOI', 'ARTICLE_TITLE', 'PUBYEAR', 'KEYWORD']].drop_duplicates()
    outputdf['DOI'] = '<a href="http://dx.doi.org/' + outputdf['DOI'] + '" target="_blank">' + outputdf['DOI'] + '</a>'
    # Lift the column-width limit so to_html does not truncate the links.
    # FIX: -1 was deprecated and later removed for this option; None is the
    # documented "no limit" value. option_context also restores the previous
    # setting even if to_html raises (the old code leaked the override then).
    with pd.option_context('display.max_colwidth', None):
        return outputdf.to_html(index=False, escape=False)
def make_hover_text(year, class_category):
    """Build the multi-line HTML hover label for one (year, category) bubble.

    Pulls the pre-aggregated statistics for the cell via getvalue().
    FIX: removed the leftover debug print() that dumped every hover text to
    stdout while the figure was being assembled.
    """
    yearint = int(year)
    # Adjacent string literals reproduce the original hover text exactly
    # (including trailing spaces).
    template = ('%s'
                '<br>Number of Papers: %s '
                '<br>Average number of authors: %s '
                '<br>Average number references: %s '
                "<br>Average number of authors' countries: %s "
                "<br>Average number of authors' institutions: %s "
                '<br>Average number of citations: %s ')
    return template % (getvalue(yearint, class_category, CONSTGrouping),
                       getvalue(yearint, class_category, 'NUM_PAPERS'),
                       getvalue(yearint, class_category, 'AUTHOR_MEAN'),
                       getvalue(yearint, class_category, 'REF_MEAN'),
                       getvalue(yearint, class_category, 'COUNTRY_MEAN'),
                       getvalue(yearint, class_category, 'INST_MEAN'),
                       getvalue(yearint, class_category, 'CIT_MEAN'))
def getvalue(year, class_category, attribute):
    """Look up one aggregated statistic for a (year, category) cell.

    Returns "" when the cell has no data; string values are returned
    unchanged, numeric values rounded to two decimals. Reads the
    module-level `dataset` and `CONSTGrouping` globals.
    """
    cell = dataset[(dataset['PUBYEAR'] == year)
                   & (dataset[CONSTGrouping] == class_category)][attribute]
    if cell.empty:
        return ""
    first = cell.values[0]
    return first if isinstance(first, str) else round(first, 2)
Note that access to the database is restricted to members of the Competence Centre for Bibliometrics (http://www.bibliometrie.info/) and its cooperation partners. However, you can continue with the CSV stored at the end of this block.
In [ ]:
#%% Load all existing keywords containing the search term, with their primary keys.
# NOTE(security): the SQL text is assembled by string concatenation from the
# `database` and `searchterm` notebook parameters. Both are trusted constants
# here — never feed user input into them.
# FIX: the inner lower(...) previously hard-coded the `wos12b` schema instead
# of using the `database` parameter like the rest of the query.
command = """SELECT kw,pk_kw FROM
(SELECT lower(""" + database + """.KEYWORDS.KEYWORD) kw,PK_KEYWORDS pk_kw
FROM """ + database + """.KEYWORDS, """ + database + """.ITEMS_KEYWORDS, """ + database + """.ITEMS
WHERE
""" + database + """.ITEMS_KEYWORDS.FK_KEYWORDS=""" + database + """.KEYWORDS.PK_KEYWORDS
AND """ + database + """.ITEMS.PK_ITEMS=""" + database + """.ITEMS_KEYWORDS.FK_ITEMS
AND lower(""" + database + """.KEYWORDS.KEYWORD) LIKE '%""" + searchterm + """%'
)
GROUP BY kw,pk_kw
ORDER BY kw DESC
"""
df = read_query(db, command)
# Keywords were already lower-cased in SQL; additionally strip " and ' so
# variants that differ only in quoting merge into one keyword.
df['KW'].replace(regex=True, inplace=True, to_replace=r'\'', value=r'')
df['KW'].replace(regex=True, inplace=True, to_replace=r'\"', value=r'')
df = df.sort_values(by=['KW', 'PK_KW'], ascending=[1, 1])
df.to_csv("keywords.csv", sep=';')

#%% Map each distinct keyword string to the array of primary keys it covers.
kw_unique = df['KW'].unique()
indextable = {}
for keyword in kw_unique:
    # (the unused stringified id-list that was built here has been removed)
    indextable[keyword] = df[df['KW'] == keyword]['PK_KW'].unique()
#%% Fetch all matching journal papers, one DB round-trip per distinct keyword.
# FIX: DataFrame.append was deprecated and removed in pandas 2.0 — collect
# the per-keyword frames in a list and concatenate once at the end (this is
# also O(n) instead of O(n^2) in total rows).
frames = []
print('loading papers for keyword:')
for i, keyword in enumerate(kw_unique, start=1):
    print('...' + keyword + ' (' + str(i) + ' of ' + str(len(kw_unique)) + ')')
    arrkeyword_ids = df[df['KW'] == keyword]['PK_KW'].unique()
    strkeyword_ids = array_to_string(arrkeyword_ids)
    command = """SELECT ROUND(PUBYEAR,0) as PUBYEAR, CLASSIFICATION, DOI, ARTICLE_TITLE,
AUTHOR_CNT,
REF_CNT,
COUNTRY_CNT,
INST_CNT,
cc.COUNT AS CIT_CNT
FROM
""" + database + """.ITEMS i,
""" + database + """.ITEMS_KEYWORDS i_k,
""" + database + """.CITINGCOUNTS cc,
""" + database + """.ITEMS_CLASSIFICATIONS i_c,
""" + database + """.CLASSIFICATIONS c
WHERE i.PK_ITEMS=i_k.FK_ITEMS
AND i_c.FK_ITEMS=i.PK_ITEMS
AND i_c.FK_CLASSIFICATIONS=c.PK_CLASSIFICATIONS
AND cc.FK_ITEMS=i.PK_ITEMS
AND PUBTYPE='J'
AND FK_KEYWORDS IN
""" + strkeyword_ids + """
"""
    df2 = read_query(db, command)
    df2['KEYWORD'] = keyword
    frames.append(df2)
# Default (non-ignored) index matches the old append() behaviour.
df2_all_keywords = pd.concat(frames) if frames else pd.DataFrame()
df2_all_keywords.PUBYEAR = df2_all_keywords.PUBYEAR.astype(int)
df2_all_keywords.to_csv("datafromdb.csv", sep=';')
In [ ]:
# Grouping dimension for the whole analysis (alternative: "CLASSIFICATION").
CONSTGrouping = "KEYWORD"
# Reload the raw per-paper rows written by the previous cell; drop the CSV's
# index column and the incomplete year 2015.
dataset_original = pd.read_csv("datafromdb.csv", sep=";")
dataset_original = dataset_original[dataset_original.PUBYEAR != 2015]
dataset_original = dataset_original.drop(dataset_original.columns[[0]], axis=1)
# De-duplicate papers, then aggregate per (year, group): the mean of every
# *_CNT column (renamed to *_MEAN) plus the number of papers in the cell.
dataset = dataset_original.drop_duplicates(['PUBYEAR', CONSTGrouping, 'DOI', 'ARTICLE_TITLE'])
grouped = dataset.groupby(['PUBYEAR', CONSTGrouping])
means = grouped.agg('mean').rename(columns=lambda c: c.replace("_CNT", "_MEAN"))
counts = pd.DataFrame(grouped.size(), columns=['NUM_PAPERS'])
dataset = means.join(counts).reset_index()
In [ ]:
# Sorted publication years, kept both as ints and as strings (slider labels).
years_ints = sorted(set(round(dataset['PUBYEAR'], 0)))
years = [str(year) for year in years_ints]
# Distinct grouping values ('KEYWORD' or 'CLASSIFICATION'), in
# first-appearance order.
class_categories = list(dict.fromkeys(dataset[CONSTGrouping]))
# Build one plotly grid column per (year, category, dataframe column).
columns = []
for year in years:
    per_year = dataset[dataset['PUBYEAR'] == int(year)]
    for class_category in class_categories:
        cell = per_year[per_year[CONSTGrouping] == class_category]
        for col_name in cell:
            # each grid column name must be unique
            column_name = '{year}_{class_category}_{header}_grid'.format(
                year=year, class_category=class_category, header=col_name
            )
            columns.append(Column(list(cell[col_name]), column_name))
# upload grid to the plotly server
grid = Grid(columns)
url = py.grid_ops.upload(grid, 'grid' + str(time.time()), auto_open=False)
url
In [ ]:
# Chart configuration: which aggregated columns drive the x/y axes and the
# bubble size.
CONSTxaxis='INST_MEAN'
CONSTyaxis='REF_MEAN'
CONSTsize='NUM_PAPERS'
#AUTHOR_MEAN, REF_MEAN, COUNTRY_MEAN, INST_MEAN, CIT_MEAN
CONSTduration=1500  # animation/transition duration in milliseconds
#make figure: skeleton dict for the animated plotly figure
figure = {
'data': [],
'layout': {},
'frames': [],
'config': {'scrollzoom': True}
}
# fill in most of layout
# NOTE(review): axis ranges are hard-coded — confirm they fit the data.
figure['layout']['xaxis'] = {'range': [0, 12], 'title': CONSTxaxis, 'gridcolor': '#FFFFFF'}
figure['layout']['yaxis'] = {'range': [-5, 100],'title': CONSTyaxis, 'gridcolor': '#FFFFFF'}
figure['layout']['hovermode'] = 'closest'
figure['layout']['plot_bgcolor'] = 'rgb(240, 245, 250)'
# wide right margin leaves room for the legend next to the plot
figure['layout']['margin'] = {'r': 290}
# Year slider: starts at 2009 and animates through `years`.
figure['layout']['slider'] = {
'args': [
'slider.value', {
'duration': CONSTduration,
'ease': 'cubic-in-out'
}
],
'initialValue': '2009',
'plotlycommand': 'animate',
'values': years,
'visible': True
}
figure['layout']['showlegend']=True;
figure['layout']['legend']={'x':1,'y':1,'font':{'size':10}};
# Play/Pause buttons driving the frame animation.
figure['layout']['updatemenus'] = [
{
'buttons': [
{
'args': [None, {'frame': {'duration': CONSTduration, 'redraw': True},
'fromcurrent': True, 'transition': {'duration': CONSTduration, 'easing': 'quadratic-in-out'}}],
'label': 'Play',
'method': 'animate'
},
{
'args': [[None], {'frame': {'duration': 0, 'redraw': True}, 'mode': 'immediate',
'transition': {'duration': 0}}],
'label': 'Pause',
'method': 'animate'
}
],
'direction': 'left',
'pad': {'r': 10, 't': 87},
'showactive': False,
'type': 'buttons',
'x': 0.1,
'xanchor': 'right',
'y': 0,
'yanchor': 'top'
}
]
# Slider skeleton; the per-year steps are appended in a later cell.
sliders_dict = {
'active': 0,
'yanchor': 'top',
'xanchor': 'left',
'currentvalue': {
'font': {'size': 20},
'prefix': 'Year:',
'visible': True,
'xanchor': 'right'
},
'transition': {'duration': 300, 'easing': 'cubic-in-out'},
'pad': {'b': 10, 't': 50},
'len': 0.9,
'x': 0.1,
'y': 0,
'steps': []
}
# Assign each grouping value a random (non-seeded) RGB display color.
# FIX: the original re-created a `r = lambda: random.randint(0,255)` on every
# loop iteration; calling randint directly is equivalent and clearer.
custom_colors = {}
for class_category in dataset[CONSTGrouping].unique():
    custom_colors[class_category] = 'rgb(%d,%d,%d)' % (
        random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
In [ ]:
# Initial traces (year 2009): one bubble trace per category, with marker
# position and size referencing the uploaded grid columns.
col_name_template = '{year}_{class_category}_{header}_grid'
year = 2009
for class_category in class_categories:
    def _colref(header):
        # grid column reference for this (year, category, column) triple
        return grid.get_column_reference(col_name_template.format(
            year=year, class_category=class_category, header=header))
    data_dict = {
        'xsrc': _colref(CONSTxaxis),
        'ysrc': _colref(CONSTyaxis),
        'mode': 'markers',
        'text': make_hover_text(year, class_category),
        # NOTE(review): 'paper' is not a standard plotly trace attribute —
        # presumably consumed by custom client-side code; verify.
        'paper': get_all_papers(year, class_category),
        'hoverinfo': 'text',
        'marker': {
            'sizemode': 'area',
            'sizeref': 0.02,
            'sizesrc': _colref(CONSTsize),
            'color': custom_colors[class_category]
        },
        'name': class_category[:50]
    }
    figure['data'].append(data_dict)
In [ ]:
# One animation frame per year (same trace structure as the initial data),
# plus the matching slider step.
for year in years:
    frame = {'data': [], 'name': str(year)}
    for class_category in class_categories:
        def _colref(header):
            # grid column reference for this (year, category, column) triple
            return grid.get_column_reference(col_name_template.format(
                year=year, class_category=class_category, header=header))
        frame['data'].append({
            'xsrc': _colref(CONSTxaxis),
            'ysrc': _colref(CONSTyaxis),
            'mode': 'markers',
            'text': make_hover_text(year, class_category),
            # NOTE(review): 'paper' is not a standard plotly trace attribute —
            # presumably consumed by custom client-side code; verify.
            'paper': get_all_papers(year, class_category),
            'hoverinfo': 'text',
            'marker': {
                'sizemode': 'area',
                'sizeref': 0.02,
                'sizesrc': _colref(CONSTsize),
                'color': custom_colors[class_category],
                'opacity': 0.6
            },
            'name': class_category[:50]
        })
    figure['frames'].append(frame)
    sliders_dict['steps'].append({
        'args': [[year],
                 {'frame': {'duration': CONSTduration, 'redraw': True},
                  'mode': 'immediate',
                  'transition': {'duration': CONSTduration}}],
        'label': year,
        'method': 'animate'})
In [ ]:
# Attach the per-year slider steps, then upload the animated figure to the
# plotly server (opens in the browser).
figure['layout']['sliders'] = [sliders_dict]
#plot(figure)
py.icreate_animations(figure, 'BigDataPapers'+str(time.time()),auto_open=True)